# sgemm_common_128x128

# Copyright 2014 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#    http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


// Prime the pipeline before the loop starts: fetch the j0 operand vectors with
// four 128-bit LDS instructions (row 0 of readAs and readBs, at offsets 0 and
// 64, scaled by 4). Every load is tagged with barrier 1 in its control code;
// the first FFMA of each generated k-step carries wait mask 01.
--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];

// Top of the main loop. The loop body is produced by the Perl generator below.
LOOP:

<CODE>

    # Generates the unrolled body of LOOP and returns it as one string.
    # For each of the 8 k-steps (j) it emits 64 FFMAs, one per cx/cy
    # accumulator of the 8x8 tile, and places the LDS.U.128 loads for the
    # next k-step after FFMAs 0, 2, 4 and 6.
    #
    # @top and %insert are package variables (presumably filled in by the file
    # that includes this one -- confirm there). @top is emitted verbatim ahead
    # of the FFMAs; %insert maps a key of the form "j", k-step, "c", FFMA index
    # to text that is emitted right after that FFMA. The four load slots
    # listed above are overwritten here for every k-step.
    our @top;
    our %insert;

    # Visiting order of the 64 accumulators: a four-element swirl of offsets is
    # applied to every (x base, y base) pair, and the y bases are walked in
    # alternating direction for successive x bases.
    my @swirl  = ([0,2],[1,2],[1,0],[0,0]);
    my @yBases = (0,1,4,5);
    my @cOrder;
    for my $xBase (0,2,4,6)
    {
        for my $yBase (@yBases)
        {
            for my $step (@swirl)
            {
                push @cOrder, [$xBase + $step->[0], $yBase + $step->[1]];
            }
        }
        @yBases = reverse @yBases;
    }

    # Operand prefetch templates keyed by the FFMA index they follow.
    # sprintf arguments: predicate text, destination register set (0 or 1),
    # shared-memory row to read.
    my %loadAfter = (
        0 => "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n",
        2 => "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n",
        4 => "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n",
        6 => "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n",
    );

    # An FFMA gets stall count 0 instead of 1 when the first line inserted
    # after it mentions one of these mnemonics.
    my $zeroStallOps = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;

    my @asm = @top;

    for my $j (0 .. 7)
    {
        my $bank     = $j & 1;                    # register set read by this k-step
        my $nextBank = $bank ^ 1;                 # register set loaded for the next one
        my $nextRow  = ($j + 1) % 8;              # wraps back to row 0 after k-step 7
        my $loadPred = $j == 7 ? '@P0' : '   ';   # only the wrap-around loads are predicated

        for my $slot (0, 2, 4, 6)
        {
            $insert{"j${j}c$slot"} = sprintf($loadAfter{$slot}, $loadPred, $nextBank, $nextRow);
        }

        for my $c (0 .. $#cOrder)
        {
            my ($x, $y) = @{ $cOrder[$c] };
            my $ins     = $insert{"j${j}c$c"} || '';

            # Control code fields: the wait mask (01 on the first FFMA of the
            # k-step only), two unused slots, the yield flag (set on FFMA 32
            # unless its stall is 0) and the stall count.
            my $wait  = $c == 0 ? '01' : '--';
            my $stall = (split "\n", $ins)[0] =~ $zeroStallOps ? 0 : 1;
            my $yield = ($c == 32 && $stall) ? 'Y' : '-';
            my $ctrl  = join ':', $wait, '-', '-', $yield, $stall;

            push @asm,
                sprintf("%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n",
                        $ctrl,  $x,$y,  $bank,$x,  $bank,$y,  $x,$y),
                $ins;
        }
    }
    return join '', @asm;

</CODE>

// Fall-through path after the generated loop body. Moves on to the next slice
// of work: increments `loop`, advances the A/B offsets ta/tb by
// param_ldaz/param_ldbz, reloads k from param_k and rebuilds the 64-bit track
// pointers as param_A/param_B + (offset << 2). While loop < param_loops (P1)
// control transfers to REMAINDER, a label defined outside this section;
// otherwise execution continues into the C output code below.
--:-:-:-:1      IADD loop, loop, 1;
--:-:-:-:1      IADD ta, ta, param_ldaz;
--:-:-:-:1      IADD tb, tb, param_ldbz;
--:-:-:-:3      MOV  k, param_k;
--:-:-:-:1      ISETP.LT.AND P1, PT, loop, param_loops, PT;
// Each low-word LEA sets the carry (.CC) consumed by the LEA.HI.X that follows.
--:-:-:-:6      LEA      trackA0.CC, ta, param_A[0],     2;
--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
--:-:-:-:6      LEA      trackB0.CC, tb, param_B[0],     2;
--:-:-:-:0      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
--:-:-:Y:5  @P1 BRA.U REMAINDER;

// Output setup. Reads the block indices, derives this thread's shared-memory
// addresses and C coordinates, builds the C row pointers and strides, loads
// the epilogue parameters (alpha, beta, flags), seeds the random number
// generator and precomputes the rounding constants used by STORE_C.
//
// Each S2R is tagged with its own barrier (1, 2, 3); the first consumers below
// carry the matching wait masks 01 (blockA), 02 (blockB) and 04 (blockZ).
--:-:1:-:1      S2R blockA, SR_CTAID.Y;
--:-:2:-:1      S2R blockB, SR_CTAID.Z;
--:-:3:-:1      S2R blockZ, SR_CTAID.X;
<SCHEDULE_BLOCK>

// Split the thread id into the bit fields used by the index math below.
--:-:-:-:1      LOP.AND tid_31,  tid, 31;
--:-:-:-:1      LOP.AND tid_96,  tid, 96;
--:-:-:-:1      LOP.AND tid_128, tid, 128;

// writeCs = (readAs / 4) * 128 + readBs;
// Both read offsets are first masked to their low 12 bits; the shift by 5 in
// ISCADD multiplies readAs by 32, which equals (readAs / 4) * 128.
--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;

// cx = tid_31 | (tid_128 >> 2);
--:-:-:-:1      SHR.U32  cx00, tid_128, 2;
--:-:-:-:1      LOP.OR   cx00, tid_31,  cx00;

// readCs = ((tid_96 << 4) | cx) << 2;
--:-:-:-:1      SHL      readCs, tid_96,  4;
--:-:-:-:1      LOP.OR   readCs, readCs, cx00;
--:-:-:-:1      SHL      readCs, readCs, 2;

// cx += blockB*128;
// cx64 is the second column handled by this thread, 64 to the right of cx00.
02:-:-:-:1      ISCADD  cx00, blockB, cx00, 7;
--:-:-:-:1      IADD    cx64, cx00, 64;

// cy = blockA*128 + (tid_96 >> 1)
--:-:-:-:1      SHR.U32 cy00, tid_96, 1;
01:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;

// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
// ci accumulates the element index; the two LEAs turn it into a 64-bit byte
// address (C00y0 low word with carry, C00y1 high word).
--:-:-:-:1      MOV  ldc,  param_ldc;
--:-:-:-:1      MOV  ldcz, param_ldcz;
--:-:-:-:1      XMAD.LO  ci, ldc,  cy00,   cx00, xmad_c;
04:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;

// Byte strides for 1, 4 and 60 rows of C:
// ldc1 = ldc * 4, ldc4 = ldc * 16, ldc60 = (ldc << 8) - ldc4 = ldc * 240.
--:-:-:-:1      SHL  ldc1, ldc, 2;
--:-:-:-:1      SHL  ldc4, ldc, 4;
--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;

--:-:-:-:1      MOV alpha, param_alpha;
--:-:-:-:1      MOV beta,  param_beta;
--:-:-:-:1      MOV flags, param_flags;

// P6 = (beta != 0), tested as an integer compare against RZ. STORE_C folds P6
// into the predicates of its reads of the existing C values.
--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT;

// Seed the Tausworthe generator: lfsr0 mixes `seed` with the column index,
// lfsr1 and lfsr2 start from the clock and global timer registers. Each of
// those two is then rotated right (funnel shift with both halves equal) by
// its own low 5 bits XORed with tid_31.
--:-:-:-:1      LOP.XOR lfsr0, seed, cx00;
--:-:-:-:1      CS2R lfsr1, SR_CLOCKLO;
--:-:-:-:1      CS2R lfsr2, SR_GLOBALTIMERLO;
--:-:-:-:1      LOP.AND clk_shf1, lfsr1, 31;
--:-:-:-:1      LOP.AND clk_shf2, lfsr2, 31;
--:-:-:-:1      LOP.XOR clk_shf1, clk_shf1, tid_31;
--:-:-:-:1      LOP.XOR clk_shf2, clk_shf2, tid_31;
--:-:-:-:1      SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1;
--:-:-:-:1      SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number with the requested mantissa.
// exp_scale = (127 - 32 - (flags>>16)) << 23
// Read as a float this bit pattern is 2^-(32 + mantissa_bits); the mantissa
// width comes from the upper 16 bits of flags.
--:-:-:-:1      SHR.U32 mantissa_bits, flags, 16;
--:-:-:-:1      IADD    exp_scale, -mantissa_bits,  95;
--:-:-:-:1      SHL     exp_scale,  exp_scale, 23;

// Truncation mask used after round amount is added.
// trunc_mask = 0xffffffff << (23 - mantissa_bits)
// Note: mantissa_bits is overwritten here with the shift amount (23 - bits).
--:-:-:-:1      IADD mantissa_bits, -mantissa_bits, 23;
--:-:-:-:1      MOV  trunc_mask, -1;
--:-:-:-:1      SHL  trunc_mask, trunc_mask, mantissa_bits;

</SCHEDULE_BLOCK>

// Derive the pointers and row indices for rows +4, +8 and +12 from row 0, each
// one ldc4 bytes (4 rows) past the previous. The instructions placed between
// each carry-setting IADD and its IADD.X add up, with the IADD's own stall
// count, to 6 cycles.
// d0-d3 are cleared so that the unconditional beta FFMAs in STORE_C add zero
// when the predicated LDGs that normally fill them do not execute.
--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
--:-:-:-:1      MOV d0, RZ;
--:-:-:-:1      IADD   cy04, cy00,  4;
--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
--:-:-:-:1      MOV d1, RZ;
--:-:-:-:1      IADD   cy08, cy00,  8;
--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
--:-:-:-:1      MOV d2, RZ;
--:-:-:-:1      MOV d3, RZ;
--:-:-:-:1      IADD   cy12, cy00,  12;
--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;

// NOTE(review): presumably ensures every thread has finished its shared-memory
// operand reads before STORE_C starts writing through writeCs -- confirm.
--:-:-:-:5      BAR.SYNC 0;

<CODE>

    # Generates the store driver and returns it as one string: eight rounds
    # (y = 0..7), each scaling the accumulators cx0..cx7 of row y by alpha
    # into c0..c7 and then calling STORE_C.
    #
    # Before round 4 the four C row pointers and row indices are moved forward
    # by 60 rows. STORE_C itself advances them by one row per call, so after
    # the first four calls this lands 64 rows past the starting position.
    my $advance = '';
    foreach my $row (qw(00 04 08 12))
    {
        # The stall of 5 on the carry-setting IADD plus 1 on the row-index IADD
        # separates the .CC producer from its IADD.X consumer.
        $advance .=
            "--:-:-:-:5      IADD   C${row}y0.CC, C${row}y0, ldc60;\n" .
            "--:-:-:-:1      IADD   cy${row},     cy${row},  60;\n" .
            "--:-:-:-:1      IADD.X C${row}y1,    C${row}y1, RZ;\n";
    }
    $advance .= "\n";

    my $out = '';
    for my $y (0 .. 7)
    {
        $out .= $advance if $y == 4;

        for my $x (0 .. 7)
        {
            # The last FMUL of the group is emitted with stall count 0.
            my $stall = $x < 7 ? 1 : 0;
            $out .= "--:-:-:-:$stall      FMUL c$x, cx${x}y$y, alpha;\n";
        }

        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
    }
    return $out;

</CODE>

// All rows are stored. P4 is left by STORE_C as the stochastic-round flag
// (flags bit 0); when it is set, the final `seed` value is written to the
// address held in Rand (register defined outside this section).
--:-:-:-:1  @P4 STG.E.CS [Rand], seed;
--:-:-:-:5      EXIT;

// STORE_C: subroutine entered via CAL and left via RET.
// On entry c0-c7 hold eight alpha-scaled accumulators. They are exchanged
// through shared memory (STS.128 to writeCs, LDS from readCs) and written to
// four rows of C (cy00, cy04, cy08, cy12) at columns cx00 and cx64. Optional
// steps: relu (flags bit 1), adding beta times the existing C value, and
// stochastic rounding (flags bit 0). Row indices and row pointers are advanced
// by one row before returning.
// Expects P6 = (beta != 0) on entry and restores it before returning.
STORE_C:

<SCHEDULE_BLOCK>
// Read predicates for rows cy00/cy04: column in range AND beta != 0 (P6),
// combined with the row bound check.
--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;

--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;

// Fetch the existing C values; each load is tagged with its own barrier (1-4).
// NOTE(review): these address C00y / C04y while the stores further down use
// C00y0 / C04y0 -- presumably the assembler treats both spellings as the same
// register pair; confirm.
--:-:1:-:1  @P0 LDG.E d0, [C00y + 4x<00>];
--:-:2:-:1  @P1 LDG.E d1, [C00y + 4x<64>];
--:-:3:-:1  @P2 LDG.E d2, [C04y + 4x<00>];
--:-:4:-:1  @P3 LDG.E d3, [C04y + 4x<64>];

// Recompute P0-P3 without the beta term: these are the bounds-only predicates
// used for the stores of rows cy00/cy04. The row indices are then advanced.
--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;

--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
--:-:-:-:1      IADD cy00, cy00, 1;
--:-:-:-:1      IADD cy04, cy04, 1;

// Apply relu when flags bit 1 (value 2) is set: FMNMX with !PT keeps the
// larger of the value and zero. P6 is borrowed for the flag here.
--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;

// Restore P6 = (beta != 0) after its use as the relu flag.
--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; 

<ORDERED>
// Write the eight values to shared memory, then read back the values for the
// first two output rows (shared rows 0 and 1, columns +0 and +64).
// The second load of each pair is tagged with barrier 5 / 6.
--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
--:-:-:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
--:-:5:-:1      LDS c1, [readCs + 4x<0*128 + 64>];

// Stochastic Round flag (flags bit 0) into P4; tested by the branch below.
--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 1;

--:-:-:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
--:-:6:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
</ORDERED>
</SCHEDULE_BLOCK>

// c = d * beta + c. The wait masks cover the barriers of the producing loads:
// 11 = barriers 1 and 5, 02 = barrier 2, 24 = barriers 3 and 6, 08 = barrier 4.
11:-:-:-:1      FFMA c0, d0, beta, c0;
02:-:-:-:1      FFMA c1, d1, beta, c1;
24:-:-:-:1      FFMA c2, d2, beta, c2;
08:-:-:-:0      FFMA c3, d3, beta, c3;

// Skip the rounding pass when stochastic rounding is not requested.
--:-:-:-:5 @!P4 BRA.U END_ROUND1;

// First stochastic-rounding pass (rows cy00/cy04, values c0-c3). Steps the
// three LFSRs once, combines them into `seed`, and uses four rotations of it
// as the random values. Each random value is scaled to sit below the target
// mantissa's lsb, added with round-toward-zero, and the sum is then truncated.
<SCHEDULE_BLOCK>
// Strip mantissa and leave sign+exponent
--:-:-:-:1      LOP32I.AND exp0, c0, 0xff800000;
--:-:-:-:1      LOP32I.AND exp1, c1, 0xff800000;
--:-:-:-:1      LOP32I.AND exp2, c2, 0xff800000;
--:-:-:-:1      LOP32I.AND exp3, c3, 0xff800000;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number at the requested mantissa width.
// exp *= 2^-(32 + mantissa_bits); exp_scale is built in the setup code as
// (127 - 32 - mantissa_bits) << 23. For a 10-bit (fp16) mantissa this is 2^-42.
--:-:-:-:1      FMUL exp0, exp0, exp_scale;
--:-:-:-:1      FMUL exp1, exp1, exp_scale;
--:-:-:-:1      FMUL exp2, exp2, exp_scale;
--:-:-:-:1      FMUL exp3, exp3, exp_scale;

// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19);
--:-:-:-:1      LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe;
--:-:-:-:1      SHL lfsr0_1, lfsr0_1, 12;
--:-:-:-:1      SHL lfsr0_2, lfsr0, 13;
--:-:-:-:1      LOP.XOR lfsr0_2, lfsr0_2, lfsr0;
--:-:-:-:1      SHR.U32 lfsr0_2, lfsr0_2, 19;
--:-:-:-:1      LOP.XOR lfsr0, lfsr0_1, lfsr0_2;

// lfsr1 = ((lfsr1 & 0xfffffff8) <<  4) ^ (((lfsr1 << 2)  ^ lfsr1) >> 25);
--:-:-:-:1      LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8;
--:-:-:-:1      SHL lfsr1_1, lfsr1_1, 4;
--:-:-:-:1      SHL lfsr1_2, lfsr1, 2;
--:-:-:-:1      LOP.XOR lfsr1_2, lfsr1_2, lfsr1;
--:-:-:-:1      SHR.U32 lfsr1_2, lfsr1_2, 25;
--:-:-:-:1      LOP.XOR lfsr1, lfsr1_1, lfsr1_2;

// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3)  ^ lfsr2) >> 11);
--:-:-:-:1      LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0;
--:-:-:-:1      SHL lfsr2_1, lfsr2_1, 11;
--:-:-:-:1      SHL lfsr2_2, lfsr2, 3;
--:-:-:-:1      LOP.XOR lfsr2_2, lfsr2_2, lfsr2;
--:-:-:-:1      SHR.U32 lfsr2_2, lfsr2_2, 11;
--:-:-:-:1      LOP.XOR lfsr2, lfsr2_1, lfsr2_2;

// rand = lfsr0 ^ lfsr1 ^ lfsr2; (LOP3 lookup table 0x96 is a three-input XOR)
// generate 3 other rotations of this rand (right by 4, 8 and 12 bits)
--:-:-:-:1      LOP3.LUT  seed, lfsr0, lfsr1, lfsr2, 0x96;
--:-:-:-:1      SHF.R.U64 rand1, seed,  4, seed;
--:-:-:-:1      SHF.R.U64 rand2, seed,  8, seed;
--:-:-:-:1      SHF.R.U64 rand3, seed, 12, seed;

// Convert rand to float (unsigned, round toward zero); one barrier (1-4) per
// conversion, waited on by the matching FFMA below.
--:-:1:-:1      I2F.F32.U32.RZ rand0, seed;
--:-:2:-:1      I2F.F32.U32.RZ rand1, rand1;
--:-:3:-:1      I2F.F32.U32.RZ rand2, rand2;
--:-:4:-:1      I2F.F32.U32.RZ rand3, rand3;

// Scale the random number so its msb is one below the lsb of the target mantissa
// Add scaled random to number to round
01:-:-:-:1      FFMA.RZ c0, rand0, exp0, c0;
02:-:-:-:1      FFMA.RZ c1, rand1, exp1, c1;
04:-:-:-:1      FFMA.RZ c2, rand2, exp2, c2;
08:-:-:-:1      FFMA.RZ c3, rand3, exp3, c3;

// Truncate number to the requested mantissa width
// (trunc_mask = 0xffffffff << (23 - mantissa_bits), built in the setup code)
--:-:-:-:1      LOP.AND c0, c0, trunc_mask;
--:-:-:-:1      LOP.AND c1, c1, trunc_mask;
--:-:-:-:1      LOP.AND c2, c2, trunc_mask;
--:-:-:-:1      LOP.AND c3, c3, trunc_mask;
</SCHEDULE_BLOCK>

// Join point for the path that skips the first rounding pass.
END_ROUND1:

// Store rows cy00/cy04, then repeat the read / exchange / beta sequence for
// rows cy08/cy12.
<SCHEDULE_BLOCK>
// Column read predicates for the next pair of rows (include beta != 0 via P6).
--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;

// Stochastic Round flag; P6 holds it until END_ROUND2.
--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 1;

// Store the first two rows under the bounds-only predicates P0-P3. The second
// store of each pair is tagged with barrier 5 / 6 in the second control field;
// the pointer updates after this schedule block wait on those barriers.
--:-:-:-:1  @P0 STG.E.CG [C00y0 + 4x<00>], c0;
--:5:-:-:1  @P1 STG.E.CG [C00y0 + 4x<64>], c1;
--:-:-:-:1  @P2 STG.E.CG [C04y0 + 4x<00>], c2;
--:6:-:-:1  @P3 STG.E.CG [C04y0 + 4x<64>], c3;

// Read predicates for rows cy08/cy12 (bounds AND beta != 0).
--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;

--:-:1:-:1  @P0 LDG.E d0, [C08y0 + 4x<00>];
--:-:2:-:1  @P1 LDG.E d1, [C08y0 + 4x<64>];
--:-:3:-:1  @P2 LDG.E d2, [C12y0 + 4x<00>];
--:-:4:-:1  @P3 LDG.E d3, [C12y0 + 4x<64>];

// Bounds-only store predicates for rows cy08/cy12.
--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;

--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
</SCHEDULE_BLOCK>

// Advance the row 0 / row 4 pointers by one row (ldc1 bytes) once the stores
// above have released barriers 5 and 6 (wait masks 10 and 20). The cy08/cy12
// increments fill the gap between each carry producer and its IADD.X.
10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
--:-:-:-:1      IADD   cy08, cy08, 1;
--:-:-:-:1      IADD   cy12, cy12, 1;
--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;

// Read back the values for output rows cy08/cy12 (shared rows 2 and 3).
--:-:-:-:1      LDS c0, [readCs + 4x<2*128 + 00>];
--:-:5:-:1      LDS c1, [readCs + 4x<2*128 + 64>];
--:-:-:-:1      LDS c2, [readCs + 4x<3*128 + 00>];
--:-:6:-:1      LDS c3, [readCs + 4x<3*128 + 64>];

// c = d * beta + c, waiting on the LDG barriers (1-4) and LDS barriers (5, 6).
11:-:-:-:1      FFMA c0, d0, beta, c0;
02:-:-:-:1      FFMA c1, d1, beta, c1;
24:-:-:-:1      FFMA c2, d2, beta, c2;
08:-:-:-:0      FFMA c3, d3, beta, c3;

// Skip the second rounding pass when stochastic rounding is not requested.
--:-:-:-:5 @!P6 BRA.U END_ROUND2;

// Second stochastic-rounding pass (rows cy08/cy12, values c0-c3). Same scheme
// as the first pass, but the LFSRs are not stepped again: the random values
// are four further rotations (16, 20, 24, 28 bits) of the `seed` produced by
// the first pass.
<SCHEDULE_BLOCK>
// Strip mantissa and leave sign+exponent
--:-:-:-:1      LOP32I.AND exp0, c0, 0xff800000;
--:-:-:-:1      LOP32I.AND exp1, c1, 0xff800000;
--:-:-:-:1      LOP32I.AND exp2, c2, 0xff800000;
--:-:-:-:1      LOP32I.AND exp3, c3, 0xff800000;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number at the requested mantissa width.
// exp *= 2^-(32 + mantissa_bits); exp_scale is built in the setup code as
// (127 - 32 - mantissa_bits) << 23. For a 10-bit (fp16) mantissa this is 2^-42.
--:-:-:-:1      FMUL exp0, exp0, exp_scale;
--:-:-:-:1      FMUL exp1, exp1, exp_scale;
--:-:-:-:1      FMUL exp2, exp2, exp_scale;
--:-:-:-:1      FMUL exp3, exp3, exp_scale;

// generate 4 other rotations of this seed
--:-:-:-:1      SHF.R.U64 rand0, seed, 16, seed;
--:-:-:-:1      SHF.R.U64 rand1, seed, 20, seed;
--:-:-:-:1      SHF.R.U64 rand2, seed, 24, seed;
--:-:-:-:1      SHF.R.U64 rand3, seed, 28, seed;

// Convert rand to float (unsigned, round toward zero); one barrier (1-4) per
// conversion, waited on by the matching FFMA below.
--:-:1:-:1      I2F.F32.U32.RZ rand0, rand0;
--:-:2:-:1      I2F.F32.U32.RZ rand1, rand1;
--:-:3:-:1      I2F.F32.U32.RZ rand2, rand2;
--:-:4:-:1      I2F.F32.U32.RZ rand3, rand3;

// Scale the random number so its msb is one below the lsb of the target mantissa
// Add scaled random to number to round
01:-:-:-:1      FFMA.RZ c0, rand0, exp0, c0;
02:-:-:-:1      FFMA.RZ c1, rand1, exp1, c1;
04:-:-:-:1      FFMA.RZ c2, rand2, exp2, c2;
08:-:-:-:1      FFMA.RZ c3, rand3, exp3, c3;

// Truncate number to the requested mantissa width
// (trunc_mask = 0xffffffff << (23 - mantissa_bits), built in the setup code)
--:-:-:-:1      LOP.AND c0, c0, trunc_mask;
--:-:-:-:1      LOP.AND c1, c1, trunc_mask;
--:-:-:-:1      LOP.AND c2, c2, trunc_mask;
--:-:-:-:1      LOP.AND c3, c3, trunc_mask;
</SCHEDULE_BLOCK>

// Join point for the path that skips the second rounding pass.
END_ROUND2:

// Store rows cy08/cy12, restore the predicates the caller relies on, advance
// the row 8 / row 12 pointers by one row and return.

// Restore P6 = (beta != 0) for the next STORE_C call.
--:-:-:-:0      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0

// NOTE(review): the wait masks 01/02/04/08 on these stores name barriers 1-4,
// which the FFMAs above (beta and rounding) have already waited on -- they
// look redundant but harmless; confirm before changing.
01:-:-:-:1  @P0 STG.E.CG [C08y0 + 4x<00>], c0;
02:5:-:-:1  @P1 STG.E.CG [C08y0 + 4x<64>], c1;

// Stochastic Round flag: P4 is left set to flags bit 0 on return; the caller
// tests it after the last STORE_C call.
--:-:-:-:0      LOP.AND.NZ P4, RZ, flags, 1;

04:-:-:-:1  @P2 STG.E.CG [C12y0 + 4x<00>], c2;
08:6:-:-:1  @P3 STG.E.CG [C12y0 + 4x<64>], c3;

// Bump each pointer by ldc1 bytes only after the stores have released
// barriers 5 and 6 (wait masks 10 and 20).
10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;

--:-:-:-:5      RET;
